We obtain the Student Survey and Golden State Warrior data from the web. These files are part of the Lock^5 3rd Ed. data files.
Survey = read.csv("http://facweb1.redlands.edu/fac/jim_bentley/Data/Lock5Ed3/Lock5Data3eCSV/StudentSurvey.csv")
names(Survey)
## [1] "Year" "Sex" "Smoke" "Award" "HigherSAT"
## [6] "Exercise" "TV" "Height" "Weight" "Siblings"
## [11] "BirthOrder" "VerbalSAT" "MathSAT" "SAT" "GPA"
## [16] "Pulse" "Piercings"
GSW = read.csv("http://facweb1.redlands.edu/fac/jim_bentley/Data/Lock5Ed3/Lock5Data3eCSV/GSWarriors2019.csv")
names(GSW)
## [1] "Game" "Date" "Location" "Opp" "Win"
## [6] "Points" "FG" "FGA" "FG3" "FG3A"
## [11] "FT" "FTA" "Rebounds" "OffReb" "Assists"
## [16] "Steals" "Blocks" "Turnovers" "Fouls" "OppPoints"
## [21] "OppFG" "OppFGA" "OppFG3" "OppFG3A" "OppFT"
## [26] "OppFTA" "OppRebounds" "OppOffReb" "OppAssists" "OppSteals"
## [31] "OppBlocks" "OppTurnovers" "OppFouls"
Suppose we want to figure out predictors of GPA. For college students from St. Lawrence University, what helps determine student success?
### Plot all of the variables against each other
pairs(Survey[,c("GPA","Height","TV","Piercings","MathSAT","VerbalSAT")])
### Load the lattice graphics package
p_load(lattice)
### Plot GPA as a function of a couple of variables. Include the regression line.
p1 = xyplot(GPA ~ Height, data=Survey, type=c("p","r"), aspect=1)
p2 = xyplot(GPA ~ MathSAT, data=Survey, type=c("p","r"), aspect=1)
### A boxplot works well for categorical/factor variables
p3 =bwplot(Sex~GPA, data=Survey, aspect=1)
### Plot the graphics in a sinle image to save space
print(p1, split = c(1, 1, 3, 1), more = TRUE)
print(p2, split = c(2, 1, 3, 1), more = TRUE)
print(p3, split = c(3, 1, 3, 1), more = FALSE)
### Fit a multiple linear regression model
GPA.lm = lm(GPA ~ Height + TV + Piercings + MathSAT + VerbalSAT, data=Survey)
### Get the parameter estimates, standard errors, t-stats, and p-vals
summary(GPA.lm)
##
## Call:
## lm(formula = GPA ~ Height + TV + Piercings + MathSAT + VerbalSAT,
## data = Survey)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10364 -0.23038 0.02313 0.27887 0.92934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.4380195 0.4605779 5.293 2.19e-07 ***
## Height -0.0105225 0.0058703 -1.792 0.07397 .
## TV -0.0046027 0.0036572 -1.259 0.20909
## Piercings 0.0064355 0.0113332 0.568 0.57053
## MathSAT 0.0009516 0.0003397 2.801 0.00539 **
## VerbalSAT 0.0014799 0.0003155 4.690 3.99e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3677 on 332 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.1606, Adjusted R-squared: 0.1479
## F-statistic: 12.7 on 5 and 332 DF, p-value: 2.632e-11
### Fit a reduced model
GPA.lm = lm(GPA ~ Height + MathSAT + VerbalSAT, data=Survey)
### Get the parameter estimates, standard errors, t-stats, and p-vals
summary(GPA.lm)
##
## Call:
## lm(formula = GPA ~ Height + MathSAT + VerbalSAT, data = Survey)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.08701 -0.23130 0.02617 0.26942 0.95687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6159121 0.3818689 6.850 3.56e-11 ***
## Height -0.0135121 0.0048959 -2.760 0.00610 **
## MathSAT 0.0008769 0.0003314 2.646 0.00852 **
## VerbalSAT 0.0015691 0.0003091 5.076 6.42e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3678 on 334 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.1554, Adjusted R-squared: 0.1479
## F-statistic: 20.49 on 3 and 334 DF, p-value: 3.27e-12
### Check residuals
p1 = xyplot(residuals(GPA.lm)~predict(GPA.lm))
p2 = xyplot(residuals(GPA.lm)~Survey$Height)
p3 = xyplot(residuals(GPA.lm)~Survey$MathSAT)
p4 = xyplot(residuals(GPA.lm)~Survey$VerbalSAT)
### Plot using split to get four plots in a single image
print(p1, split = c(1, 1, 2, 2), more = TRUE)
print(p2, split = c(2, 1, 2, 2), more = TRUE)
print(p3, split = c(1, 2, 2, 2), more = TRUE)
print(p4, split = c(2, 2, 2, 2), more = FALSE) # more = FALSE is redundant
### Plot the default residual plots in a single image
par(mfrow=c(2,2))
plot(GPA.lm)
par(mfrow=c(1,1))
### For fun, plot the plane of estimates determined by Math and Verbal SATs
p_load(scatterplot3d)
s3d = scatterplot3d(Survey$VerbalSAT, Survey$MathSAT, Survey$GPA, xlab="Verbal SAT", ylab="Math SAT", zlab="GPA", angle=65)
s3d$plane3d(lm(GPA ~ MathSAT + VerbalSAT, data=Survey))
par(mfrow=c(1,1))
We can estimate the number of points scored by the Golden State Warriors (2018-2019 regular season) using a multiple linear regression model.
### Plot all of the variables against each other
pairs(GSW[,c("Points","Rebounds","Steals","Blocks","Assists")])
### Load the lattice graphics package
p_load(lattice)
### Plot Points as a function of a couple of variables. Include the regression line. Use split to get them in a single graphic.
p1 = xyplot(Points ~ Rebounds, data=GSW, type=c("p","r"), aspect=1)
p2 = xyplot(Points ~ Assists, data=GSW, type=c("p","r"), aspect=1)
print(p1, split = c(1, 1, 2, 1), more = TRUE)
print(p2, split = c(2, 1, 2, 1), more = FALSE)
### Fit a multiple linear regression model
GSW.lm = lm(Points ~ Rebounds + Steals + Blocks + Assists, data=GSW)
### Get the parameter estimates, standard errors, t-stats, and p-vals
summary(GSW.lm)
##
## Call:
## lm(formula = Points ~ Rebounds + Steals + Blocks + Assists, data = GSW)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.9152 -5.8121 -0.2544 5.4964 23.5621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.7579 10.2811 6.299 1.71e-08 ***
## Rebounds 0.1766 0.2070 0.853 0.3962
## Steals 0.6831 0.4055 1.684 0.0961 .
## Blocks -0.4222 0.5037 -0.838 0.4046
## Assists 1.4363 0.2237 6.421 1.01e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.41 on 77 degrees of freedom
## Multiple R-squared: 0.426, Adjusted R-squared: 0.3962
## F-statistic: 14.29 on 4 and 77 DF, p-value: 9.094e-09
### Try a reduced model
GSW.lm = lm(Points~Steals + Assists, data=GSW)
summary(GSW.lm)
##
## Call:
## lm(formula = Points ~ Steals + Assists, data = GSW)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.9611 -6.4733 0.0576 5.7018 23.8716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.0135 6.4874 10.792 <2e-16 ***
## Steals 0.6262 0.4008 1.562 0.122
## Assists 1.4577 0.2102 6.933 1e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.36 on 79 degrees of freedom
## Multiple R-squared: 0.416, Adjusted R-squared: 0.4012
## F-statistic: 28.13 on 2 and 79 DF, p-value: 5.941e-10
### Check residuals
p1 = xyplot(residuals(GSW.lm)~predict(GSW.lm))
p2 = xyplot(residuals(GSW.lm)~GSW$Assists)
p3 = xyplot(residuals(GSW.lm)~GSW$Steals)
### Plot lattice plots in single graphic image
print(p1, split = c(1, 1, 2, 2), more = TRUE)
print(p2, split = c(2, 1, 2, 2), more = TRUE)
print(p3, split = c(1, 2, 2, 2), more = FALSE)
### Use base plot to get default residual plots in a single graphic
par(mfrow=c(2,2))
plot(GSW.lm)
par(mfrow=c(1,1))
### For fun, plot the plane of estimates determined by Rebounds and Assists
p_load(scatterplot3d)
s3d = scatterplot3d(GSW$Steals, GSW$Assists, GSW$Points, xlab="Steals", ylab="Assists", zlab="Points", angle=60)
s3d$plane3d(lm(Points ~ Steals + Assists, data=GSW))